In [7]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
import plotly.express as px
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
In [8]:
df=pd.read_csv('athlete_events.csv')
In [9]:
df
Out[9]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal
0 1 A Dijiang M 24.0 180.0 80.0 China CHN 1992 Summer 1992 Summer Barcelona Basketball Basketball Men's Basketball NaN
1 2 A Lamusi M 23.0 170.0 60.0 China CHN 2012 Summer 2012 Summer London Judo Judo Men's Extra-Lightweight NaN
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN 1920 Summer 1920 Summer Antwerpen Football Football Men's Football NaN
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden DEN 1900 Summer 1900 Summer Paris Tug-Of-War Tug-Of-War Men's Tug-Of-War Gold
4 5 Christine Jacoba Aaftink F 21.0 185.0 82.0 Netherlands NED 1988 Winter 1988 Winter Calgary Speed Skating Speed Skating Women's 500 metres NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
271111 135569 Andrzej ya M 29.0 179.0 89.0 Poland-1 POL 1976 Winter 1976 Winter Innsbruck Luge Luge Mixed (Men)'s Doubles NaN
271112 135570 Piotr ya M 27.0 176.0 59.0 Poland POL 2014 Winter 2014 Winter Sochi Ski Jumping Ski Jumping Men's Large Hill, Individual NaN
271113 135570 Piotr ya M 27.0 176.0 59.0 Poland POL 2014 Winter 2014 Winter Sochi Ski Jumping Ski Jumping Men's Large Hill, Team NaN
271114 135571 Tomasz Ireneusz ya M 30.0 185.0 96.0 Poland POL 1998 Winter 1998 Winter Nagano Bobsleigh Bobsleigh Men's Four NaN
271115 135571 Tomasz Ireneusz ya M 34.0 185.0 96.0 Poland POL 2002 Winter 2002 Winter Salt Lake City Bobsleigh Bobsleigh Men's Four NaN

271116 rows × 15 columns

In [10]:
df.head()
Out[10]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal
0 1 A Dijiang M 24.0 180.0 80.0 China CHN 1992 Summer 1992 Summer Barcelona Basketball Basketball Men's Basketball NaN
1 2 A Lamusi M 23.0 170.0 60.0 China CHN 2012 Summer 2012 Summer London Judo Judo Men's Extra-Lightweight NaN
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN 1920 Summer 1920 Summer Antwerpen Football Football Men's Football NaN
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden DEN 1900 Summer 1900 Summer Paris Tug-Of-War Tug-Of-War Men's Tug-Of-War Gold
4 5 Christine Jacoba Aaftink F 21.0 185.0 82.0 Netherlands NED 1988 Winter 1988 Winter Calgary Speed Skating Speed Skating Women's 500 metres NaN
In [11]:
df.tail()
Out[11]:
ID Name Sex Age Height Weight Team NOC Games Year Season City Sport Event Medal
271111 135569 Andrzej ya M 29.0 179.0 89.0 Poland-1 POL 1976 Winter 1976 Winter Innsbruck Luge Luge Mixed (Men)'s Doubles NaN
271112 135570 Piotr ya M 27.0 176.0 59.0 Poland POL 2014 Winter 2014 Winter Sochi Ski Jumping Ski Jumping Men's Large Hill, Individual NaN
271113 135570 Piotr ya M 27.0 176.0 59.0 Poland POL 2014 Winter 2014 Winter Sochi Ski Jumping Ski Jumping Men's Large Hill, Team NaN
271114 135571 Tomasz Ireneusz ya M 30.0 185.0 96.0 Poland POL 1998 Winter 1998 Winter Nagano Bobsleigh Bobsleigh Men's Four NaN
271115 135571 Tomasz Ireneusz ya M 34.0 185.0 96.0 Poland POL 2002 Winter 2002 Winter Salt Lake City Bobsleigh Bobsleigh Men's Four NaN
In [12]:
df.describe()
Out[12]:
ID Age Height Weight Year
count 271116.000000 261642.000000 210945.000000 208241.000000 271116.000000
mean 68248.954396 25.556898 175.338970 70.702393 1978.378480
std 39022.286345 6.393561 10.518462 14.348020 29.877632
min 1.000000 10.000000 127.000000 25.000000 1896.000000
25% 34643.000000 21.000000 168.000000 60.000000 1960.000000
50% 68205.000000 24.000000 175.000000 70.000000 1988.000000
75% 102097.250000 28.000000 183.000000 79.000000 2002.000000
max 135571.000000 97.000000 226.000000 214.000000 2016.000000
In [13]:
df.info
Out[13]:
<bound method DataFrame.info of             ID                      Name Sex   Age  Height  Weight  \
0            1                 A Dijiang   M  24.0   180.0    80.0   
1            2                  A Lamusi   M  23.0   170.0    60.0   
2            3       Gunnar Nielsen Aaby   M  24.0     NaN     NaN   
3            4      Edgar Lindenau Aabye   M  34.0     NaN     NaN   
4            5  Christine Jacoba Aaftink   F  21.0   185.0    82.0   
...        ...                       ...  ..   ...     ...     ...   
271111  135569                Andrzej ya   M  29.0   179.0    89.0   
271112  135570                  Piotr ya   M  27.0   176.0    59.0   
271113  135570                  Piotr ya   M  27.0   176.0    59.0   
271114  135571        Tomasz Ireneusz ya   M  30.0   185.0    96.0   
271115  135571        Tomasz Ireneusz ya   M  34.0   185.0    96.0   

                  Team  NOC        Games  Year  Season            City  \
0                China  CHN  1992 Summer  1992  Summer       Barcelona   
1                China  CHN  2012 Summer  2012  Summer          London   
2              Denmark  DEN  1920 Summer  1920  Summer       Antwerpen   
3       Denmark/Sweden  DEN  1900 Summer  1900  Summer           Paris   
4          Netherlands  NED  1988 Winter  1988  Winter         Calgary   
...                ...  ...          ...   ...     ...             ...   
271111        Poland-1  POL  1976 Winter  1976  Winter       Innsbruck   
271112          Poland  POL  2014 Winter  2014  Winter           Sochi   
271113          Poland  POL  2014 Winter  2014  Winter           Sochi   
271114          Poland  POL  1998 Winter  1998  Winter          Nagano   
271115          Poland  POL  2002 Winter  2002  Winter  Salt Lake City   

                Sport                                     Event Medal  
0          Basketball               Basketball Men's Basketball   NaN  
1                Judo              Judo Men's Extra-Lightweight   NaN  
2            Football                   Football Men's Football   NaN  
3          Tug-Of-War               Tug-Of-War Men's Tug-Of-War  Gold  
4       Speed Skating          Speed Skating Women's 500 metres   NaN  
...               ...                                       ...   ...  
271111           Luge                Luge Mixed (Men)'s Doubles   NaN  
271112    Ski Jumping  Ski Jumping Men's Large Hill, Individual   NaN  
271113    Ski Jumping        Ski Jumping Men's Large Hill, Team   NaN  
271114      Bobsleigh                      Bobsleigh Men's Four   NaN  
271115      Bobsleigh                      Bobsleigh Men's Four   NaN  

[271116 rows x 15 columns]>
In [22]:
sport_counts = df['Sport'].value_counts().reset_index()
sport_counts.columns = ['Sport', 'Count']
custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']

fig = px.bar(sport_counts, x='Sport', y='Count', title='Sports', 
             color='Sport', color_discrete_sequence=custom_colors)

fig.update_layout(
    xaxis_title='Sport',
    yaxis_title='Count',
    xaxis_tickangle=-45,
    font=dict(size=12),
    title_font_size=16,
    title_x=0.5
)

fig.show()
In [37]:
medal_counts = df['Medal'].value_counts()
medal_counts = medal_counts[medal_counts.index.isin(['Gold', 'Silver', 'Bronze'])].reset_index()
medal_counts.columns = ['Medal', 'Count']

custom_colors = ['#F0A500', '#F26D3F', '#D43F5E'] 

fig = px.bar(medal_counts, x='Medal', y='Count', title='Medals',
             color='Medal', color_discrete_sequence=custom_colors)

fig.update_layout(
    xaxis_title='Medal',
    yaxis_title='Count',
    font=dict(size=12),
    title_font_size=16,
    title_x=0.5,
    width=600,  
    height=500   
)

fig.show()
In [41]:
sport_counts = sport_counts.head(10)

sport_counts = sport_counts.sort_values(by='Count', ascending=True)

fig = px.bar(sport_counts, y='Sport', x='Count', title='Top 10 Sports',
             orientation='h', color='Count', color_continuous_scale='Viridis')

fig.update_layout(
    xaxis_title='Count',
    yaxis_title='Sport',
    font=dict(size=12),
    title_font_size=16,
    title_x=0.5,
    width=800,  
    height=500   
)

fig.show()
In [46]:
sport_team_distribution = df.groupby(['Sport', 'Team']).size().reset_index(name='Count')

top_sports = sport_team_distribution.groupby('Sport')['Count'].sum().nlargest(10).index
top_sport_team_distribution = sport_team_distribution[sport_team_distribution['Sport'].isin(top_sports)]

top_sport_team_distribution = top_sport_team_distribution.sort_values(by='Count', ascending=True)

fig = px.bar(top_sport_team_distribution, y='Sport', x='Count', color='Team',
             orientation='h',
             labels={'Sport': 'Sport', 'Count': 'Number of Events'},
             title='Team Sports vs Individual Sports',
             color_discrete_sequence=px.colors.qualitative.Vivid)  

fig.update_layout(width=800, height=600, font=dict(size=14))

fig.show()
In [58]:
medals_only =df[df['Medal'].isin(['Gold', 'Silver', 'Bronze'])]

country_medal_distribution = medals_only.groupby(['NOC', 'Medal']).size().reset_index(name='Count')

top_countries = country_medal_distribution.groupby('NOC')['Count'].sum().nlargest(10).index
top_country_medal_distribution = country_medal_distribution[country_medal_distribution['NOC'].isin(top_countries)]
top_country_medal_distribution = top_country_medal_distribution.sort_values(by=['NOC', 'Count'])


fig = px.bar(top_country_medal_distribution, x='Count', y='NOC', color='Medal',
             labels={'NOC': 'Country', 'Count': 'Number of Medals', 'medal': 'Medal Type'},
             title='Countries by Medal Distribution',
             orientation='h', 
             color_discrete_sequence=['#FFD700', '#C0C0C0', '#CD7F32'])

fig.update_layout(barmode='stack', width=800, height=600, font=dict(size=14))

fig.show()
In [61]:
top_countries = country_medal_distribution.groupby('NOC')['Count'].sum().nlargest(10).index

top_sports = medals_only.groupby('Sport')['NOC'].count().nlargest(5).index

top_country_sport_distribution = medals_only[medals_only['NOC'].isin(top_countries) & medals_only['Sport'].isin(top_sports)]

country_sport_medal_distribution = top_country_sport_distribution.groupby(['NOC', 'Sport']).size().reset_index(name='Count')

fig = px.bar(country_sport_medal_distribution, x='Count', y='NOC', color='Sport',
             labels={'NOC': 'Country', 'Count': 'Number of Medals', 'Sport': 'Sport'},
             title='Countries by Sport and Medal Distribution',
             orientation='h')

fig.update_layout(barmode='stack', width=900, height=700, font=dict(size=14))

fig.show()
In [63]:
medals_only = df[df['Medal'].isin(['Gold', 'Silver', 'Bronze'])]

event_medal_distribution = medals_only.groupby('Event')['Medal'].count().reset_index(name='Count')

top_events = event_medal_distribution.nlargest(10, 'Count')

fig = px.bar(top_events, x='Event', y='Count',
             labels={'Event': 'Event', 'Count': 'Medals'},
             title='Events by Medal Distribution',
             color='Count', 
             color_continuous_scale='Viridis')

fig.update_layout(xaxis_title='Event', yaxis_title='Medals', width=900, height=600, font=dict(size=14))

fig.show()
In [68]:
Sex_dis = df['Sex'].value_counts().reset_index()
Sex_dis.columns = ['Sex', 'Count']
Sex_dis = Sex_dis.head(10)


fig = px.bar(Sex_dis, x='Sex', y='Count', color='Sex',
             title='Top 10 Categories',
             labels={'Count': 'Count'},
             color_discrete_sequence=px.colors.qualitative.Plotly)

fig.update_layout(width=400, height=500, font_size=14)

fig.show()
In [ ]: